# -*-coding:utf-8 -*-
'Web scraper: collects book listings from the Qidian ranking pages.'

import requests
from lxml import etree
import os


# 爬虫函数
def getPageInfo():
    """Scrape pages 1-5 of the Qidian (起点中文网) book listing and append
    each book's title;author;category;detail;status line to a local
    UTF-8 text file under python_homework/export_file/.

    Returns None; the scraped data is persisted as a side effect.
    """
    for page in range(1, 6):
        # Target listing page (the trailing "page={}.html" selects the page number).
        url = ('https://www.qidian.com/all?orderId=&style=1&pageSize=20'
               '&siteid=1&pubflag=0&hiddenField=0&page={}.html').format(page)
        # A timeout keeps one hung request from blocking the whole run.
        response = requests.get(url=url, timeout=10)
        response.encoding = 'utf-8'
        tree = etree.HTML(response.text)
        # One <li> per book in the listing (renamed from `list`/`type`,
        # which shadowed the builtins).
        items = tree.xpath("//ul[@class='all-img-list cf']/li")
        rows = []
        for li in items:
            # Book title
            title = li.xpath("./div[2]/h4/a/text()")[0]
            # Author name
            author = li.xpath("./div[2]/p[1]/a[1]/text()")[0]
            # Category (used for the ranking)
            category = li.xpath("./div[2]/p[1]/a[2]/text()")[0]
            # Sub-category / detail tag
            detail = li.xpath("./div[2]/p[1]/a[3]/text()")[0]
            # Serialization status (ongoing / finished)
            status = li.xpath("./div[2]/p[1]/span/text()")[0]
            rows.append(';'.join([title, author, category, detail, status]) + '\n')
        # join once instead of repeated `+=` (avoids quadratic concatenation)
        result = ''.join(rows)
        out_dir = os.path.join(os.getcwd(), 'python_homework', 'export_file')
        # The export folder may not exist on a fresh checkout.
        os.makedirs(out_dir, exist_ok=True)
        file = os.path.join(out_dir, '汤学希+201827010320+数据源文件.txt')
        # `with` guarantees the handle is closed even if write() raises.
        with open(file, mode='a', encoding='utf-8') as output:
            output.write(result)
    print("爬取成功")


if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    getPageInfo()
